View Javadoc

1   
2   /*
3    * SmartCrawler
4    *
5    * $Id: LinkBuilderImpl.java,v 1.7 2005/07/08 12:09:08 vincool Exp $
6    * Copyright 2005 Davide Pozza
7    *
8    * This program is free software; you can redistribute it
9    * and/or modify it under the terms of the GNU General Public
10   * License as published by the Free Software Foundation;
11   * either version 2 of the License, or (at your option) any
12   * later version.
13   *
14   * This program is distributed in the hope that it will be
15   * useful, but WITHOUT ANY WARRANTY; without even the implied
16   * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
17   * PURPOSE. See the GNU General Public License for more
18   * details.
19   *
20   * You should have received a copy of the GNU General Public
21   * License along with this program; if not, write to the Free
22   * Software Foundation, Inc., 59 Temple Place, Suite 330,
23   * Boston, MA 02111-1307 USA
24   *
25   */
26  
27  package org.smartcrawler.extractor;
28  
29  import org.apache.log4j.Logger;
30  import org.smartcrawler.common.Link;
31  import org.smartcrawler.common.MalformedLinkException;
32  import org.smartcrawler.common.SCLogger;
33  
34  /***
35   *
36   *
37   * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
38   * @version <tt>$Revision: 1.7 $</tt>
39   */
40  public class LinkBuilderImpl implements LinkBuilder {
41  
42  
43      private static Logger log = SCLogger.getLogger(LinkBuilderImpl.class);
44      private static Logger logLink = SCLogger.getLinkLogger();
45  
46      private Link parsedPageLink;
47      private String parsedPagePath;
48      private String hostName;
49  
50      /***
51       * Creates a new instance of LinkBuilder
52       * @param parsedPageLink
53       */
54      public LinkBuilderImpl(Link parsedPageLink) {
55          this.parsedPageLink = parsedPageLink;
56          try {
57              hostName = parsedPageLink.getHost();
58              log.debug("LinkBuilderImpl(): hostName="+hostName);
59              parsedPagePath = parsedPageLink.getPath(false);
60  
61          } catch (Exception e){
62              hostName = null;
63              log.debug("LinkBuilderImpl(): Invalid link " + parsedPageLink);
64          }
65      }
66  
67      /***
68       *
69       * @param htmlURL
70       * @return
71       */
72      public Link buildLink(HtmlURL htmlURL) throws MalformedLinkException {
73  
74          log.debug("buildLink(): BEGIN");
75  
76          String extractedURL = htmlURL.getCleanedLinkAsString();
77  
78          Link res = null;
79          //String strCurrItemLink = currItemLink.toString();
80          log.debug("buildLink(): normalizing: " + extractedURL
81                  + " of type " + HtmlURL.LINK_ABSOLUTE_URI);
82          //String tmpExtractedLink = this.cleanedURL;
83          String tmpLinkStr = "";
84  
85          //validity check
86          if(!htmlURL.isValid()) {
87              log.debug("buildLink(): Invalid link " + extractedURL);
88              return null;
89  
90          }else if (htmlURL.getType() == HtmlURL.LINK_ABSOLUTE_URI) {
91              //
92              // ex. "/path1/file.htm"
93              //
94              tmpLinkStr = HtmlURL.PROTOCOL_PREF +
95                  hostName + extractedURL;
96  
97          } else if (htmlURL.getType() == HtmlURL.LINK_ABSOLUTE_URL) {
98              //
99              //ex. "http://www.satollo.com/path1/file.htm"
100             //
101             tmpLinkStr = extractedURL;
102 
103         } else if (htmlURL.getType() == HtmlURL.LINK_RELATIVE) {
104             //
105             // ex. "../path1/file.htm" or "path1/file.htm" and not "/path1/file.htm"
106             //
107 
108             tmpLinkStr = HtmlURL.PROTOCOL_PREF + hostName;//Ex. http://www.brucalipto.org
109             String tmpExtractedURL = extractedURL;//ex. images/95.png
110             String newLinkPath = parsedPagePath;//ex. /
111 
112 
113             //log.debug("buildLink(): tmpLinkStr=" + tmpLinkStr);
114             //log.debug("buildLink(): tmpExtractedURL=" + tmpExtractedURL);
115             //log.debug("buildLink(): newLinkPath=" + newLinkPath);
116             //log.debug("buildLink(): parsedPagePath=" + parsedPagePath);
117 
118             if (!tmpExtractedURL.startsWith("../")) {
119                 tmpLinkStr += newLinkPath + HtmlURL.PATH_SEP + tmpExtractedURL;
120 
121             } else {
122                 //int dummy = 0;
123                 while(tmpExtractedURL.startsWith("../")) {
124                     //remove the last level on newLinkPath and
125                     //the first ../ on the link
126 
127                     /*
128                     logLink.info(
129                              "WHILE BEFORE: " + (++dummy) +
130                              " parsedPageLink=" + parsedPageLink +
131                              " parsedPagePath=" + parsedPagePath +
132                              " tmpExtractedURL="+tmpExtractedURL +
133                              " newLinkPath=" + newLinkPath +
134                              " tmpLinkStr=" + tmpLinkStr);
135                      */
136                     if (newLinkPath.length() > 0) {
137                         int idx = newLinkPath.lastIndexOf(HtmlURL.PATH_SEP);
138                         if (idx >= 0) newLinkPath = newLinkPath.substring(0, idx);
139                     }
140                     tmpExtractedURL = tmpExtractedURL.substring(3);
141                     tmpLinkStr += HtmlURL.PATH_SEP + newLinkPath;
142                     /*
143                     logLink.info(
144                              "WHILE AFTER: " + (dummy) +
145                              " parsedPageLink=" + parsedPageLink +
146                              " parsedPagePath=" + parsedPagePath +
147                              " tmpExtractedURL="+tmpExtractedURL +
148                              " newLinkPath=" + newLinkPath +
149                              " tmpLinkStr=" + tmpLinkStr);
150                      */
151 
152                 }// while(tmpExtractedURL.startsWith("../")) {
153 
154                 tmpLinkStr += tmpExtractedURL;
155             }
156 
157         } else {
158             log.warn("buildLink(): url " + extractedURL + " UNHANDLED!! ");
159         }
160         res = new Link(tmpLinkStr);
161 
162         logLink.info(parsedPageLink + " " + parsedPagePath
163                 + " " + extractedURL + " " + res);
164 
165         log.debug("buildLink(): curr. level: " + parsedPageLink +
166                 " orig. link: " + extractedURL +
167                 "; normalized: " + res);
168 
169         log.debug("buildLink(): END");
170         return res;
171     }
172 
173 }